{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "a7f18f06",
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "import spacy\n",
    "from nltk import word_tokenize\n",
    "from nltk.corpus import stopwords\n",
    "from string import punctuation\n",
    "small_model = spacy.load(\"en_core_web_sm\")\n",
    "large_model = spacy.load(\"en_core_web_lg\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "37db5d80",
   "metadata": {},
   "source": [
    "# Tokenization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "5962079c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to /home/zhenya/nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting en-core-web-sm==3.6.0\n",
      "  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.8/12.8 MB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0mm\n",
      "\u001b[?25hRequirement already satisfied: spacy<3.7.0,>=3.6.0 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from en-core-web-sm==3.6.0) (3.6.1)\n",
      "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (3.0.12)\n",
      "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (1.0.4)\n",
      "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (1.0.9)\n",
      "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (2.0.7)\n",
      "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (3.0.8)\n",
      "Requirement already satisfied: thinc<8.2.0,>=8.1.8 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (8.1.12)\n",
      "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (1.1.2)\n",
      "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (2.4.7)\n",
      "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (2.0.9)\n",
      "Requirement already satisfied: typer<0.10.0,>=0.3.0 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (0.9.0)\n",
      "Requirement already satisfied: pathy>=0.10.0 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (0.10.2)\n",
      "Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (6.4.0)\n",
      "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (4.66.1)\n",
      "Requirement already satisfied: numpy>=1.15.0 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (1.24.4)\n",
      "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (2.31.0)\n",
      "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (2.3.0)\n",
      "Requirement already satisfied: jinja2 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (3.1.2)\n",
      "Requirement already satisfied: setuptools in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (68.2.0)\n",
      "Requirement already satisfied: packaging>=20.0 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (23.1)\n",
      "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (3.3.0)\n",
      "Requirement already satisfied: annotated-types>=0.4.0 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (0.5.0)\n",
      "Requirement already satisfied: pydantic-core==2.6.3 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (2.6.3)\n",
      "Requirement already satisfied: typing-extensions>=4.6.1 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (4.7.1)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (3.2.0)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (3.4)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (1.26.16)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (2023.7.22)\n",
      "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from thinc<8.2.0,>=8.1.8->spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (0.7.10)\n",
      "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from thinc<8.2.0,>=8.1.8->spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (0.1.2)\n",
      "Requirement already satisfied: click<9.0.0,>=7.1.1 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from typer<0.10.0,>=0.3.0->spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (8.1.7)\n",
      "Requirement already satisfied: MarkupSafe>=2.0 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from jinja2->spacy<3.7.0,>=3.6.0->en-core-web-sm==3.6.0) (2.1.3)\n",
      "Installing collected packages: en-core-web-sm\n",
      "Successfully installed en-core-web-sm-3.6.0\n",
      "\n",
      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n",
      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
      "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n",
      "You can now load the package via spacy.load('en_core_web_sm')\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package averaged_perceptron_tagger to\n",
      "[nltk_data]     /home/zhenya/nltk_data...\n",
      "[nltk_data]   Package averaged_perceptron_tagger is already up-to-\n",
      "[nltk_data]       date!\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting en-core-web-lg==3.6.0\n",
      "  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.6.0/en_core_web_lg-3.6.0-py3-none-any.whl (587.7 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m587.7/587.7 MB\u001b[0m \u001b[31m1.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:05\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: spacy<3.7.0,>=3.6.0 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from en-core-web-lg==3.6.0) (3.6.1)\n",
      "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (3.0.12)\n",
      "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (1.0.4)\n",
      "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (1.0.9)\n",
      "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (2.0.7)\n",
      "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (3.0.8)\n",
      "Requirement already satisfied: thinc<8.2.0,>=8.1.8 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (8.1.12)\n",
      "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (1.1.2)\n",
      "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (2.4.7)\n",
      "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (2.0.9)\n",
      "Requirement already satisfied: typer<0.10.0,>=0.3.0 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (0.9.0)\n",
      "Requirement already satisfied: pathy>=0.10.0 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (0.10.2)\n",
      "Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (6.4.0)\n",
      "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (4.66.1)\n",
      "Requirement already satisfied: numpy>=1.15.0 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (1.24.4)\n",
      "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (2.31.0)\n",
      "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (2.3.0)\n",
      "Requirement already satisfied: jinja2 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (3.1.2)\n",
      "Requirement already satisfied: setuptools in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (68.2.0)\n",
      "Requirement already satisfied: packaging>=20.0 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (23.1)\n",
      "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (3.3.0)\n",
      "Requirement already satisfied: annotated-types>=0.4.0 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (0.5.0)\n",
      "Requirement already satisfied: pydantic-core==2.6.3 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (2.6.3)\n",
      "Requirement already satisfied: typing-extensions>=4.6.1 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (4.7.1)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (3.2.0)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (3.4)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (1.26.16)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (2023.7.22)\n",
      "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from thinc<8.2.0,>=8.1.8->spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (0.7.10)\n",
      "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from thinc<8.2.0,>=8.1.8->spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (0.1.2)\n",
      "Requirement already satisfied: click<9.0.0,>=7.1.1 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from typer<0.10.0,>=0.3.0->spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (8.1.7)\n",
      "Requirement already satisfied: MarkupSafe>=2.0 in /home/zhenya/.cache/pypoetry/virtualenvs/book-2nd-ed-tI3JII_H-py3.10/lib/python3.10/site-packages (from jinja2->spacy<3.7.0,>=3.6.0->en-core-web-lg==3.6.0) (2.1.3)\n",
      "Installing collected packages: en-core-web-lg\n",
      "Successfully installed en-core-web-lg-3.6.0\n",
      "\n",
      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n",
      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
      "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n",
      "You can now load the package via spacy.load('en_core_web_lg')\n"
     ]
    }
   ],
   "source": [
    "# Downloads if necessary\n",
    "#nltk.download('punkt')\n",
    "#!python -m spacy download en_core_web_sm\n",
    "#nltk.download('averaged_perceptron_tagger')\n",
    "#!python -m spacy download en_core_web_lg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "018907eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "def word_tokenize_nltk(text):\n",
    "    return nltk.tokenize.word_tokenize(text)\n",
    "\n",
    "def word_tokenize_spacy(text, model):\n",
    "    doc = model(text)\n",
    "    return [token.text for token in doc]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "30a43f30",
   "metadata": {},
   "source": [
    "# Subjects and objects"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5d9c5169",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_subject_phrase(doc):\n",
    "    for token in doc:\n",
    "        if (\"subj\" in token.dep_):\n",
    "            subtree = list(token.subtree)\n",
    "            start = subtree[0].i\n",
    "            end = subtree[-1].i + 1\n",
    "            return doc[start:end]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e8c2f7be",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_object_phrase(doc):\n",
    "    for token in doc:\n",
    "        if (\"dobj\" in token.dep_):\n",
    "            subtree = list(token.subtree)\n",
    "            start = subtree[0].i\n",
    "            end = subtree[-1].i + 1\n",
    "            return doc[start:end]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "672e51a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_dative_phrase(doc):\n",
    "    for token in doc:\n",
    "        if (\"dative\" in token.dep_):\n",
    "            subtree = list(token.subtree)\n",
    "            start = subtree[0].i\n",
    "            end = subtree[-1].i + 1\n",
    "            return doc[start:end]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "9f487248",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_prepositional_phrase_objs(doc):\n",
    "    prep_spans = []\n",
    "    for token in doc:\n",
    "        if (\"pobj\" in token.dep_):\n",
    "            subtree = list(token.subtree)\n",
    "            start = subtree[0].i\n",
    "            end = subtree[-1].i + 1\n",
    "            prep_spans.append(doc[start:end])\n",
    "    return prep_spans"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3ca761c7",
   "metadata": {},
   "source": [
    "# Clean data functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "7e282272",
   "metadata": {},
   "outputs": [],
   "source": [
    "stop_words = list(stopwords.words('english'))\n",
    "stop_words.append(\"``\")\n",
    "stop_words.append(\"'s\")\n",
    "\n",
    "def remove_stopwords_helper(x):\n",
    "    new_list = [w for w in x if w not in stop_words and w not in punctuation]\n",
    "    return new_list\n",
    "\n",
    "def tokenize(input_df, column_name):\n",
    "    input_df[column_name + \"_tokenized\"] = input_df[column_name].apply(word_tokenize)\n",
    "    return input_df\n",
    "\n",
    "def remove_stopword_punct(input_df, column_name):\n",
    "    input_df[column_name] = input_df[column_name].apply(remove_stopwords_helper)\n",
    "    return input_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "537bff79",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
